Take-home Exercise 3

Author

SMLurker

Getting Started

Background

Data Wrangling

Importing R pacakges

pacman::p_load(jsonlite,tidygraph,ggraph,visNetwork,graphlayouts,ggforce,skimr, 
               tidytext,tidyverse,ggstatsplot,ggiraph)

Importing JSON file by using jsonlite packages

mc3_data <- fromJSON("data/MC3.json")

Extracting edges

mc3_edges <- as_tibble(mc3_data$links) %>%
  distinct() %>%
  mutate(source = as.character(source),
         target = as.character(target),
         type = as.character(type)) %>%
  group_by(source, target, type) %>%
  summarise(weights=n()) %>%
  filter(source!=target) %>%
  ungroup()

Extracting nodes

mc3_nodes <- as_tibble(mc3_data$nodes) %>%
  mutate(country = as.character(country),
         id = as.character(id),
         product_services = as.character(product_services),
         revenue_omu = as.numeric(as.character(revenue_omu)),
         type = as.character(type)) %>%
  select(id, country, type, revenue_omu, product_services)

Exploring the edges data frame

Show the code
skim(mc3_edges)
Data summary
Name mc3_edges
Number of rows 24036
Number of columns 4
_______________________
Column type frequency:
character 3
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
source 0 1 6 700 0 12856 0
target 0 1 6 28 0 21265 0
type 0 1 16 16 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
weights 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁
Edge table
Show the code
DT::datatable(mc3_edges)
Plotting by type
Show the code
ggplot(data = mc3_edges,
       aes(x = type)) +
  geom_bar()

Exploring the nodes data frame

Show the code
skim(mc3_nodes)
Data summary
Name mc3_nodes
Number of rows 27622
Number of columns 5
_______________________
Column type frequency:
character 4
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1 6 64 0 22929 0
country 0 1 2 15 0 100 0
type 0 1 7 16 0 3 0
product_services 0 1 4 1737 0 3244 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
revenue_omu 21515 0.22 1822155 18184433 3652.23 7676.36 16210.68 48327.66 310612303 ▇▁▁▁▁
Node table
Show the code
DT::datatable(mc3_nodes)
Plotting by type
Show the code
ggplot(data = mc3_nodes,
       aes(x = type)) +
  geom_bar()

Visualisation and Analysis

Building network model

id1 <- mc3_edges %>%
  select(source) %>%
  rename(id = source)
id2 <- mc3_edges %>%
  select(target) %>%
  rename(id = target)
mc3_nodes1 <- rbind(id1, id2) %>%
  distinct() %>%
  left_join(mc3_nodes,
            unmatched = "drop")
mc3_graph <- tbl_graph(nodes = mc3_nodes1,
                       edges = mc3_edges,
                       directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness(),
         closeness_centrality = centrality_closeness())

mc3_graph %>%
  filter(betweenness_centrality >= 100000) %>%
ggraph(layout = "fr") +
  geom_edge_link(aes(alpha=0.5)) +
  geom_node_point(aes(
    size = betweenness_centrality,
    colors = "lightblue",
    alpha = 0.5)) +
  scale_size_continuous(range=c(1,10))+
  theme_graph()

Text Sensing

Simple word count
mc3_nodes %>% 
    mutate(n_fish = str_count(product_services, "fish")) 
# A tibble: 27,622 × 6
   id                          country type  revenue_omu product_services n_fish
   <chr>                       <chr>   <chr>       <dbl> <chr>             <int>
 1 Jones LLC                   ZH      Comp…  310612303. Automobiles           0
 2 Coleman, Hall and Lopez     ZH      Comp…  162734684. Passenger cars,…      0
 3 Aqua Advancements Sashimi … Oceanus Comp…  115004667. Holding firm wh…      0
 4 Makumba Ltd. Liability Co   Utopor… Comp…   90986413. Car service, ca…      0
 5 Taylor, Taylor and Farrell  ZH      Comp…   81466667. Fully electric …      0
 6 Harmon, Edwards and Bates   ZH      Comp…   75070435. Discount superm…      0
 7 Punjab s Marine conservati… Riodel… Comp…   72167572. Beef, pork, chi…      0
 8 Assam   Limited Liability … Utopor… Comp…   72162317. Power and Gas s…      0
 9 Ianira Starfish Sagl Import Rio Is… Comp…   68832979. Light commercia…      0
10 Moran, Lewis and Jimenez    ZH      Comp…   65592906. Automobiles, tr…      0
# ℹ 27,612 more rows
Tokenisation
token_nodes <- mc3_nodes %>%
  unnest_tokens(word, 
                product_services)

token_nodes %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
      labs(x = "Count",
      y = "Unique words",
      title = "Count of unique words found in product_services field")

Removing stopwords
select nodes with “seafood”,“fish”,“carp”,“catfish”,“herring”,“mackerel”,“pollock”,“salmon”,“shark”,“tuna” as part of the product service
stopwords_removed <- token_nodes %>% 
  anti_join(stop_words) %>%
  filter(word %in% c("seafood","fish","carp","catfish","herring","mackerel","pollock","salmon","shark","tuna")) %>%
  distinct()
stopwords_removed %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
      labs(x = "Count",
      y = "Unique words",
      title = "Count of unique words found in product_services field")

Plotting by type
Show the code
ggplot(data = stopwords_removed,
       aes(x = type)) +
  geom_bar()+
  geom_text(stat="count", 
      aes(label=paste0(after_stat(count))),vjust=-1)+
  ylim(0,1500)

Analyze company type

Since from the above, the number of company type is much greater than the other 2 types, we will focus on the company type and find the distribution of revenue_omu
Show the code
clean_nodes_c <-stopwords_removed %>%
  drop_na(revenue_omu) %>%
  filter(type=="Company")

set.seed(1234)

gghistostats(
  data = clean_nodes_c,
  x = revenue_omu,
  type = "bayes",
  test.value = 60,
  xlab = "revenue_omu"
)

Most of the companies have a revenue_omu within the first bar, but there are some companies that have far more revenue than others, we select the revenue_omu>400,000
Show the code
df_nodes <- clean_nodes_c %>% 
  filter(revenue_omu>200000)
df_edges <- mc3_edges %>%
  filter(source %in% df_nodes$id)

id3 <- df_edges %>%
  select(source) %>%
  rename(id = source)
id4 <- df_edges %>%
  select(target) %>%
  rename(id = target)
df_nodes_1 <- rbind(id3, id4) %>%
  distinct() %>%
  left_join(mc3_nodes,
            unmatched = "drop")

df_graph <- tbl_graph(nodes = df_nodes_1,
                       edges = df_edges,
                       directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness(),
         closeness_centrality = centrality_closeness())

g <- df_graph %>%
  mutate(betweenness_centrality = centrality_betweenness()) %>%
  ggraph(layout = "kk") + 
  geom_edge_link(aes(width=weights), 
                 alpha=0.2) +
  scale_edge_width(range = c(0.01, 0.1)) +
  geom_node_point(aes(colour = country,
            size=betweenness_centrality))
g + theme_graph()

The companies in country ZH seems to have a high revenue_omu but they dont have many business partner records

Grouping

Calculate partner numbers (numbers of targets of a source), and assign partner = -1 if targets dont have a partner record, we only select those with a partner and group them by revenue_omu and partner numbers.
Show the code
df_edges_1 <- mc3_edges %>%
  filter(source %in% clean_nodes_c$id)

df_edges_1r <- df_edges_1 %>%
  group_by(source) %>%
  summarize(partners=n_distinct(target)) %>%
  rename(id=source) %>%
  ungroup()

df_nodes_2 <- clean_nodes_c %>% 
  left_join(df_edges_1r) %>%
  distinct()

df_nodes_2$partners[is.na(df_nodes_2$partners)] <- -1

df_nodes_2$group[(df_nodes_2$partners>quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 1
df_nodes_2$group[(df_nodes_2$partners>quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 2
df_nodes_2$group[(df_nodes_2$partners<=quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 3
df_nodes_2$group[(df_nodes_2$partners<=quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 4
df_nodes_2$group[(df_nodes_2$partners==-1) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 5
df_nodes_2$group[(df_nodes_2$partners==-1) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 6


df_nodes_2 <- df_nodes_2[,!names(df_nodes_2) %in% 
      c("word")] %>%
  distinct() 

set.seed(1234)

gghistostats(
  data = df_nodes_2[df_nodes_2$partners>0,],
  x = partners,
  type = "bayes",
  test.value = 60,
  xlab = "partners")

Here we define Group 1 as No. of partners > 50% and revenue <= 80%
Group 2 as No. of partners > 50% and revenue > 80%
Group 3 as No. of partners <= 50% and revenue <= 80%
Group 4 as No. of partners <= 50% and revenue > 80%
Group 5 and 6 are groups with no partners, but revenue less than or equal to 80% and revenue greater than 80%
We then visualize the nodes and edges and since Group 5 and 6 dont have partners, they will not appear in the network. Group 4 is selected since they dont have many partners in business but they have high revenue_omu
Show the code
df_edges_2 <- mc3_edges %>%
  filter(source %in% df_nodes_2[df_nodes_2$group==4,]$id)

id5 <- df_edges_2 %>%
  select(source) %>%
  rename(id = source)
id6 <- df_edges_2 %>%
  select(target) %>%
  rename(id = target)
df_nodes_3 <- rbind(id5, id6) %>%
  distinct() %>%
  left_join(mc3_nodes,
            unmatched = "drop") %>%
  left_join(df_nodes_2)


df_graph_3 <- tbl_graph(nodes = df_nodes_3,
                       edges = df_edges_2,
                       directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness(),
         closeness_centrality = centrality_closeness())

g_3 <- df_graph_3 %>%
  mutate(betweenness_centrality = centrality_betweenness()) %>%
  ggraph(layout = "kk") + 
  geom_edge_link(aes(width=weights), 
                 alpha=0.2) +
  scale_edge_width(range = c(0.01, 0.1)) +
  geom_node_text(aes(label = ifelse(group > 1, as.character(id), "")), size = 2)+
  geom_node_point(aes(colour = group,
            size=betweenness_centrality))
g_3 + theme_graph()

Conclusion

From the above, we should pay attention to Group 4 and 6 since they dont have many partners in business but they have high revenue_omu, which probably because they do not follow the law and do illegal fishing to gain high revenue.